Analyse transcript level data from 5’ sensing comparisons and do some exploratory data analysis.
Read in data: htseq-count files created with usegalaxy.eu
htseq_CDS <- read.delim("input/Galaxy151-[CDS_htseqCount].tabular", header=TRUE, row.names=1)
htseq_TUs <- read.delim("input/Galaxy154-[TU_htseqCount].tabular", header=TRUE, row.names=1)
zuordnung <- read.delim("input/FileNames_Galaxy-transcript-history.csv", header=TRUE)
row.names(zuordnung) <- names(htseq_CDS)[c(2,3,1,8,7,6,5,4)]
names(htseq_CDS) <- zuordnung[names(htseq_CDS),]$strain
names(htseq_TUs) <- zuordnung[names(htseq_TUs),]$strain
coldata <- read.csv("input/colData_transcript.csv", row.names=1)
htseq_CDS <- htseq_CDS[,row.names(coldata)]
htseq_TUs <- htseq_TUs[,row.names(coldata)]
Do actual analysis
# create DESeq2 data object
ddsMat_CDS <- DESeqDataSetFromMatrix(countData = htseq_CDS,
colData = coldata,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
ddsMat_TUs <- DESeqDataSetFromMatrix(countData = htseq_TUs,
colData = coldata,
design = ~ strain)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# run DESeq
ddsMat_CDS <- DESeq(ddsMat_CDS)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsMat_TUs <- DESeq(ddsMat_TUs)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
write.csv(data.frame(factor=ddsMat_CDS$sizeFactor), file="output/transcript_sizeFactors.csv")
write.csv(data.frame(counts(ddsMat_CDS, normalized=TRUE)), file="output/Transcript_CDS_normalizedCounts.csv")
plotDispEsts(ddsMat_CDS, main="CDS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
plotDispEsts(ddsMat_TUs, main="TU comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
pdf(file="output/DESeq2_Plots/CDS/ddsMat_CDS_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_CDS, main="CDS comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png
## 2
pdf(file="output/DESeq2_Plots/TU/ddsMat_TU_DispEsts.pdf", width=4.5, height=4.5)
plotDispEsts(ddsMat_TUs, main="TU comparison", xlab="Mean of Normalized Counts", ylab="Dispersion")
dev.off()
## png
## 2
p <- PCA_plot(ddsMat_CDS, "RNA Features")
p
ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS_PCA.pdf", plot=p, width=9, height=9, units="cm")
p <- PCA_plot(ddsMat_TUs, "TU")
p
ggsave("output/DESeq2_Plots/TU/ddsMat_TU_PCA.pdf", plot=p, width=9, height=9, units="cm")
p <- heatmap_plot(ddsMat_CDS, "RNA Features")
p
ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS_heatMap.pdf", plot=p, width=15, height=12, units="cm")
p <- heatmap_plot(ddsMat_TUs, "TU")
p
ggsave("output/DESeq2_Plots/TU/ddsMat_TU_heatMap.pdf", plot=p, width=15, height=12, units="cm")
if(FALSE){
transcript_coverage <- read.delim("input/transcript_coverage_combined_5sensing.txt", header=TRUE, row.names=1)
# create DESeq2 data object
ddsMat_transc <- DESeqDataSetFromMatrix(countData = transcript_coverage,
colData = coldata,
design = ~ strain)
ddsMat_transc$sizeFactor <- ddsMat_CDS$sizeFactor
transc_norm_counts <- counts(ddsMat_transc, normalized=TRUE)
rm(ddsMat_transc)
grp_File <- data.frame("WT"=apply(transc_norm_counts[,c("WT1", "WT2", "WT3")],1,mean), "dWT"=apply(transc_norm_counts[,c("dWT1", "dWT2", "dWT3")],1,mean), "TV"=apply(transc_norm_counts[,c("TV1", "TV2")],1,mean))
grp_File$seqname <- rep(NA, length(grp_File$WT))
seqnames <- c("BA000022.2", "AP004310.1", "AP004311.1", "AP004312.1", "AP006585.1")
for(i in seqnames){
grp_File[which(grepl(i, row.names(grp_File))),]$seqname <- i
}
grp_File$strand <- rep(NA, length(grp_File$WT))
for(i in c("plus", "minus")){
grp_File[which(grepl(i, row.names(grp_File))),]$strand <- i
}
for(i in seqnames){
for(j in c("plus", "minus")){
tmp <- subset(grp_File, grp_File$seqname==i & grp_File$strand==j)[,c("WT", "dWT", "TV")]
s=""
if(j=="plus"){
s="fw"
}else{
s="rev"
}
write.table(tmp, file=paste("output/grp/transcript/", i, "_transcript_WT-dWT-TV_", s, ".grp", sep=""), sep="\t", row.names=FALSE, col.names = FALSE)
}
}
}
Extract results and use changeAnnotation_DESeq2.py and annotation_locusTags_stand13012021.csv to add annotation to tables.
# extract results
CDS_result_dWT_TV <- results(ddsMat_CDS, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(CDS_result_dWT_TV[order(CDS_result_dWT_TV$padj),])), file="output/DESeq2_resultsTables/results_CDS-dWT-TV.tsv")
CDS_result_dWT_WT <- results(ddsMat_CDS, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(CDS_result_dWT_WT[order(CDS_result_dWT_WT$padj),])), file="output/DESeq2_resultsTables/results_CDS-dWT-WT.tsv")
TUs_result_dWT_TV <- results(ddsMat_TUs, contrast=c("strain", "dWT", "TV")) # dWT/TV -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(TUs_result_dWT_TV[order(TUs_result_dWT_TV$padj),])), file="output/DESeq2_resultsTables/results_TUs-dWT-TV.tsv")
TUs_result_dWT_WT <- results(ddsMat_TUs, contrast=c("strain", "dWT", "WT")) # dWT/WT -> higher in dWT: higher log2FC
write_tsv(rownames_to_column(as.data.frame(TUs_result_dWT_WT[order(TUs_result_dWT_WT$padj),])), file="output/DESeq2_resultsTables/results_TUs-dWT-WT.tsv")
Execute in directory “DESeq2_resultsTables”: python changeAnnotation_DESeq2.py results_CDS-dWT-TV.tsv results_CDS-dWT-TV_annotated.tsv python changeAnnotation_DESeq2.py results_CDS-dWT-WT.tsv results_CDS-dWT-WT_annotated.tsv
python TUs_add_info.py results_TUs-dWT-TV.tsv results_TUs-dWT-TV_annotated.tsv python TUs_add_info.py results_TUs-dWT-WT.tsv results_TUs-dWT-WT_annotated.tsv
pvaluePlot(CDS_result_dWT_TV, "CDS rne(WT)-rne(5p)")
pvaluePlot(CDS_result_dWT_WT, "CDS rne(WT)-WT")
pvaluePlot(TUs_result_dWT_TV, "rne(WT)-rne(5p)")
pvaluePlot(TUs_result_dWT_WT, "TUs rne(WT)-WT")
## png
## 2
## png
## 2
## png
## 2
## png
## 2
CDS_result_dWT_TV <- subset(CDS_result_dWT_TV, !is.na(CDS_result_dWT_TV$padj))
nrow(CDS_result_dWT_TV)
## [1] 4411
CDS_result_dWT_WT <- subset(CDS_result_dWT_WT, !is.na(CDS_result_dWT_WT$padj))
nrow(CDS_result_dWT_WT)
## [1] 4519
TUs_result_dWT_TV <- subset(TUs_result_dWT_TV, !is.na(TUs_result_dWT_TV$padj))
nrow(TUs_result_dWT_TV)
## [1] 3000
TUs_result_dWT_WT <- subset(TUs_result_dWT_WT, !is.na(TUs_result_dWT_WT$padj))
nrow(TUs_result_dWT_WT)
## [1] 3699
count_up_down(CDS_result_dWT_TV, foldchange=0.8, padjusted=0.05)
## [1] "number of features down: 36"
## [1] "number of features up: 32"
p <- volcanoPlot_ggplot(as.data.frame(CDS_result_dWT_TV), foldchange=0.8, padjusted=0.05, text=TRUE)
p
## Warning: Removed 4343 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 53 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
## Warning: Removed 4343 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 63 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p <- MAplot_ggplot(CDS_result_dWT_TV, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))")# + ylim(-5.5,+5.5)
p
ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(CDS_result_dWT_WT, foldchange=0.8, padjusted=0.05)
## [1] "number of features down: 76"
## [1] "number of features up: 87"
p <- volcanoPlot_ggplot(as.data.frame(CDS_result_dWT_WT), foldchange=0.8, padjusted=0.05, text=TRUE)
p
## Warning: Removed 4356 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 132 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-WT_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
## Warning: Removed 4356 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 147 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
p <- MAplot_ggplot(CDS_result_dWT_WT, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/WT)")# + ylim(-5.5,+5.5)
p
ggsave("output/DESeq2_Plots/CDS/ddsMat_CDS-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(TUs_result_dWT_TV, foldchange=0.8, padjusted=0.05)
## [1] "number of features down: 30"
## [1] "number of features up: 29"
p <- volcanoPlot_ggplot(as.data.frame(TUs_result_dWT_TV), foldchange=0.8, padjusted=0.05)
p
ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-TV_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
p <- MAplot_ggplot(TUs_result_dWT_TV, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/rne(5p))")# + ylim(-5.5,+5.5)
p
ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-TV_MAplot.pdf",plot=p, width=15, height=12, units="cm")
count_up_down(TUs_result_dWT_WT, foldchange=0.8, padjusted=0.05)
## [1] "number of features down: 52"
## [1] "number of features up: 106"
p <- volcanoPlot_ggplot(as.data.frame(TUs_result_dWT_WT), foldchange=0.8, padjusted=0.05)
p
ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-WT_VolcanoPlot.pdf", plot=p, width=15, height=12, units="cm")
p <- MAplot_ggplot(TUs_result_dWT_WT, foldchange=0.8, y_axis_label = "Log2 fold-change(rne(WT)/WT)")# + ylim(-5.5,+5.5)
p
ggsave("output/DESeq2_Plots/TU/ddsMat_TUs-dWT-WT_MAplot.pdf",plot=p, width=15, height=12, units="cm")
features <- rtracklayer::import("input/20210217_syne_onlyUnique_withFeat.gff3")
CDS_features <- subset(features, features$type=="CDS")
TUs <- rtracklayer::import("input/Kopf_4091_TUs_combined.gff3")
types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)
updown <- c(rep("up",9), rep("down",9))
df_features <- data.frame(cbind(type=types, updown=updown))
types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
t_feat <- subset(features, features$type==t)
subset_CDS <- subset(CDS_result_dWT_TV, row.names(CDS_result_dWT_TV) %in% t_feat$locus_tag)
index_up <- which(df_features$type==t & df_features$updown=="up")
index_down <- which(df_features$type==t & df_features$updown=="down")
df_features[index_up,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange>0.8)) # count number of features affected
df_features[index_down,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange<(-0.8))) # count number of features affected
df_features[df_features$type==t,"number_total"] <- length(t_feat)
}
df_features
## type updown number_feat_overlap number_total
## 1 CDS up 13 3675
## 2 5UTR up 0 979
## 3 3UTR up 2 29
## 4 tRNA up 0 42
## 5 rRNA up 0 6
## 6 ncRNA up 7 318
## 7 asRNA up 9 1071
## 8 CRISPR up 1 3
## 9 misc up 0 11
## 10 CDS down 26 3675
## 11 5UTR down 1 979
## 12 3UTR down 0 29
## 13 tRNA down 0 42
## 14 rRNA down 2 6
## 15 ncRNA down 5 318
## 16 asRNA down 2 1071
## 17 CRISPR down 0 3
## 18 misc down 0 11
write.csv(df_features, file="output/RNAfeatures_upDown_dWT_TV.csv")
# prepare data.frame for barplot
types <- rep(c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc"),2)
updown <- c(rep("up",9), rep("down",9))
df_features <- data.frame(cbind(type=types, updown=updown))
types <- c("CDS","5UTR", "3UTR", "tRNA", "rRNA", "ncRNA", "asRNA", "CRISPR", "misc")
for(t in types){
t_feat <- subset(features, features$type==t)
subset_CDS <- subset(CDS_result_dWT_WT, row.names(CDS_result_dWT_WT) %in% t_feat$locus_tag)
index_up <- which(df_features$type==t & df_features$updown=="up")
index_down <- which(df_features$type==t & df_features$updown=="down")
df_features[index_up,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange>0.8)) # count number of features affected
df_features[index_down,"number_feat_overlap"] <- nrow(subset(subset_CDS, subset_CDS$padj<0.05 & subset_CDS$log2FoldChange<(-0.8))) # count number of features affected
df_features[df_features$type==t,"number_total"] <- length(t_feat)
}
df_features
## type updown number_feat_overlap number_total
## 1 CDS up 72 3675
## 2 5UTR up 4 979
## 3 3UTR up 0 29
## 4 tRNA up 0 42
## 5 rRNA up 0 6
## 6 ncRNA up 3 318
## 7 asRNA up 5 1071
## 8 CRISPR up 3 3
## 9 misc up 0 11
## 10 CDS down 54 3675
## 11 5UTR down 6 979
## 12 3UTR down 0 29
## 13 tRNA down 0 42
## 14 rRNA down 0 6
## 15 ncRNA down 11 318
## 16 asRNA down 5 1071
## 17 CRISPR down 0 3
## 18 misc down 0 11
write.csv(df_features, file="output/RNAfeatures_upDown_dWT_WT.csv")
go_functional_enrichment(CDS_result_dWT_TV, write=TRUE, path_up="output/enrichment/go_enrichment/dWT_TV_up.csv", path_down="output/enrichment/go_enrichment/dWT_TV_down.csv")
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue geneID Count
## <0 rows> (or 0-length row.names)
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue geneID Count
## <0 rows> (or 0-length row.names)
go_functional_enrichment(CDS_result_dWT_WT, write=TRUE, path_up="output/enrichment/go_enrichment/dWT_WT_up.csv", path_down="output/enrichment/go_enrichment/dWT_WT_down.csv")
## ID Description GeneRatio BgRatio pvalue
## GO:0008272 GO:0008272 sulfate transport 3/40 10/2432 0.0004569312
## GO:0004519 GO:0004519 endonuclease activity 4/40 24/2432 0.0005262848
## p.adjust qvalue geneID Count
## GO:0008272 0.01447283 0.01357261 slr1452/slr1453/slr1454 3
## GO:0004519 0.01447283 0.01357261 slr0393/slr1129/slr7068/slr7071 4
## ID Description GeneRatio BgRatio
## GO:0020037 GO:0020037 heme binding 4/44 24/2432
## GO:0022904 GO:0022904 respiratory electron transport chain 3/44 12/2432
## pvalue p.adjust qvalue geneID
## GO:0020037 0.0007613007 0.0374427 0.03141645 sll0450/slr0898/slr1136/slr1137
## GO:0022904 0.0010852956 0.0374427 0.03141645 sll0450/slr1136/slr1137
## Count
## GO:0020037 4
## GO:0022904 3
kegg_functional_enrichment(CDS_result_dWT_TV, write=TRUE, path_up="output/enrichment/kegg_enrichment/dWT_TV_up.csv", path_down="output/enrichment/kegg_enrichment/dWT_TV_down.csv")
## Reading KEGG annotation online:
##
## Reading KEGG annotation online:
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue geneID Count
## <0 rows> (or 0-length row.names)
## [1] ID Description GeneRatio BgRatio pvalue p.adjust
## [7] qvalue geneID Count
## <0 rows> (or 0-length row.names)
kegg_functional_enrichment(CDS_result_dWT_WT, write=TRUE, path_up="output/enrichment/kegg_enrichment/dWT_WT_up.csv", path_down="output/enrichment/kegg_enrichment/dWT_WT_down.csv")
## ID Description GeneRatio BgRatio pvalue p.adjust
## syn00920 syn00920 Sulfur metabolism 3/14 14/945 0.0008577528 0.009435281
## qvalue geneID Count
## syn00920 0.007223182 slr1452/slr1453/slr1454 3
## ID Description GeneRatio BgRatio pvalue
## syn00910 syn00910 Nitrogen metabolism 5/20 18/945 1.795269e-05
## p.adjust qvalue geneID
## syn00910 0.0004308646 0.0003401562 sll0450/sll1450/sll1451/slr0898/slr0899
## Count
## syn00910 5
dWT_TV_go_gsea <- go_gsea(CDS_result_dWT_TV, write=TRUE, path="output/enrichment/go_gsea/dWT_TV_go_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## ID Description setSize
## GO:0005840 GO:0005840 ribosome 54
## GO:0003735 GO:0003735 structural constituent of ribosome 55
## GO:0006412 GO:0006412 translation 89
## GO:0019843 GO:0019843 rRNA binding 37
## enrichmentScore NES pvalue p.adjust qvalues rank
## GO:0005840 -0.6390628 -2.295449 3.421905e-06 0.000508704 0.0004954709 782
## GO:0003735 -0.6311004 -2.269506 5.847173e-06 0.000508704 0.0004954709 782
## GO:0006412 -0.4883304 -1.952793 2.118945e-04 0.011935156 0.0116246826 993
## GO:0019843 -0.6347054 -2.107158 2.743714e-04 0.011935156 0.0116246826 856
## leading_edge
## GO:0005840 tags=63%, list=18%, signal=52%
## GO:0003735 tags=62%, list=18%, signal=52%
## GO:0006412 tags=47%, list=23%, signal=37%
## GO:0019843 tags=62%, list=19%, signal=51%
dWT_WT_go_gsea <- go_gsea(CDS_result_dWT_WT, write=TRUE, path="output/enrichment/go_gsea/dWT_WT_go_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## ID Description setSize
## GO:0005840 GO:0005840 ribosome 54
## GO:0003735 GO:0003735 structural constituent of ribosome 55
## GO:0019843 GO:0019843 rRNA binding 37
## GO:0006412 GO:0006412 translation 89
## GO:0003723 GO:0003723 RNA binding 61
## GO:0004519 GO:0004519 endonuclease activity 24
## enrichmentScore NES pvalue p.adjust qvalues rank
## GO:0005840 0.6623795 2.316038 2.294226e-07 4.014895e-05 3.477563e-05 480
## GO:0003735 0.6334220 2.215209 1.513803e-06 1.324578e-04 1.147303e-04 480
## GO:0019843 0.7021435 2.290831 5.304239e-06 3.094139e-04 2.680037e-04 471
## GO:0006412 0.5300910 2.042958 1.011786e-05 4.426562e-04 3.834135e-04 1072
## GO:0003723 0.5739667 2.048829 5.849954e-05 2.047484e-03 1.773460e-03 298
## GO:0004519 0.7017147 2.084257 1.213835e-04 3.244858e-03 2.810584e-03 698
## leading_edge
## GO:0005840 tags=44%, list=11%, signal=40%
## GO:0003735 tags=42%, list=11%, signal=38%
## GO:0019843 tags=49%, list=10%, signal=44%
## GO:0006412 tags=48%, list=24%, signal=38%
## GO:0003723 tags=18%, list=7%, signal=17%
## GO:0004519 tags=42%, list=15%, signal=35%
dWT_TV_kegg_gsea <- kegg_gsea(CDS_result_dWT_TV, write=TRUE, path="output/enrichment/kegg_gsea/dWT_TV_kegg_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## ID Description setSize enrichmentScore NES pvalue
## syn03010 syn03010 Ribosome 53 -0.6267786 -2.265002 1.351246e-05
## p.adjust qvalues rank leading_edge
## syn03010 0.0008107473 0.0008107473 774 tags=60%, list=18%, signal=50%
dWT_WT_kegg_gsea <- kegg_gsea(CDS_result_dWT_WT, write=TRUE, path="output/enrichment/kegg_gsea/dWT_WT_kegg_gsea.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## ID Description setSize enrichmentScore NES
## syn03010 syn03010 Ribosome 53 0.6172389 2.148302
## syn00910 syn00910 Nitrogen metabolism 18 -0.7406120 -2.021652
## syn00920 syn00920 Sulfur metabolism 14 0.7610731 1.948010
## syn00190 syn00190 Oxidative phosphorylation 49 -0.5079952 -1.767313
## pvalue p.adjust qvalues rank
## syn03010 1.234140e-05 0.0007528256 0.0006495475 480
## syn00910 8.109319e-05 0.0024733423 0.0021340313 899
## syn00920 1.511540e-03 0.0301079573 0.0259775300 648
## syn00190 1.974292e-03 0.0301079573 0.0259775300 978
## leading_edge
## syn03010 tags=40%, list=11%, signal=36%
## syn00910 tags=78%, list=20%, signal=63%
## syn00920 tags=57%, list=14%, signal=49%
## syn00190 tags=39%, list=22%, signal=31%
gseaplot2(dWT_TV_go_gsea, geneSetID=1:4)
p <- gseaplot2(dWT_TV_kegg_gsea, geneSetID =1)
p
ggsave("output/enrichment/Plots/dWT_TV_KEGG.pdf", plot=p, width=15, height=12, units="cm")
gseaplot2(dWT_WT_go_gsea, geneSetID=1:9)
p <- gseaplot2(dWT_WT_kegg_gsea, geneSetID =1:4)
p
ggsave("output/enrichment/Plots/dWT_WT_KEGG.pdf", plot=p, width=15, height=12, units="cm")
browseKEGGNew_3(dWT_TV_kegg_gsea, "syn03010", 1) #
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn03010", 1) #
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn00910", 2) #
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn00920", 3) #
browseKEGGNew_3(dWT_WT_kegg_gsea, "syn00190", 4) #
First, a data frame in which feature names are assigned to their feature type (CDS, ncRNA, …) has to be created and the respective info read in.
df_featureType <- data.frame(feature_type=as.character(features$type), feature_name=features$locus_tag)
feature_type_gsea <- function(DESEq2_dataframe, write=FALSE, path=""){
locus_tags <- row.names(DESEq2_dataframe)
geneList <- DESEq2_dataframe$log2FoldChange
names(geneList) <- locus_tags
geneList = sort(geneList, decreasing = TRUE)
set.seed(42)
go_gsea_object <- GSEA(geneList, TERM2GENE = df_featureType, seed=TRUE)
print(head(go_gsea_object)[,1:10])
if(write){
write.csv(go_gsea_object, path)
}
return(go_gsea_object)
}
features_gsea_dWT_TV <- feature_type_gsea(CDS_result_dWT_TV, TRUE, "output/enrichment/GSEA_RNAfeatures_dWT-TV.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## ID Description setSize enrichmentScore NES pvalue
## 3UTR 3UTR 3UTR 22 0.7332143 2.140823 1.025791e-05
## ncRNA ncRNA ncRNA 224 0.3159062 1.420087 9.746224e-03
## p.adjust qvalues rank leading_edge
## 3UTR 4.103165e-05 2.159561e-05 254 tags=55%, list=6%, signal=52%
## ncRNA 1.949245e-02 1.025918e-02 672 tags=29%, list=15%, signal=26%
features_gsea_dWT_WT <- feature_type_gsea(CDS_result_dWT_WT, TRUE, "output/enrichment/GSEA_RNAfeatures_dWT-WT.csv")
## preparing geneSet collections...
## GSEA analysis...
## leading edge analysis...
## done...
## ID Description setSize enrichmentScore NES pvalue
## ncRNA ncRNA ncRNA 236 -0.3814923 -1.693676 0.0000167076
## 3UTR 3UTR 3UTR 23 -0.5893611 -1.746820 0.0048635827
## tRNA tRNA tRNA 36 -0.4795011 -1.542110 0.0175544830
## 5UTR 5UTR 5UTR 310 -0.2801913 -1.286619 0.0181392166
## p.adjust qvalues rank leading_edge
## ncRNA 6.683038e-05 NA 948 tags=39%, list=21%, signal=33%
## 3UTR 9.727165e-03 NA 547 tags=48%, list=12%, signal=42%
## tRNA 1.813922e-02 NA 244 tags=25%, list=5%, signal=24%
## 5UTR 1.813922e-02 NA 880 tags=28%, list=19%, signal=25%
p <- gseaplot2(features_gsea_dWT_TV, geneSetID=1:2)
p
ggsave("output/enrichment/Plots/dWT_TV_RNAfeatures.pdf", plot=p, width=15, height=12, units="cm")
p <- gseaplot2(features_gsea_dWT_WT, geneSetID=1:4)
p
ggsave("output/enrichment/Plots/dWT_WT_RNAfeatures.pdf", plot=p, width=15, height=12, units="cm")
go_gsea_baseMeans <- function(DESEq2_dataframe, write=FALSE, path=""){
locus_tags <- row.names(DESEq2_dataframe)
geneList <- DESEq2_dataframe$baseMean
names(geneList) <- locus_tags
geneList = sort(geneList, decreasing = TRUE)
set.seed(42)
go_gsea_object <- GSEA(geneList, TERM2GENE = term_to_gene, TERM2NAME=term_to_name, seed=TRUE)
tryCatch({
print(head(go_gsea_object)[,1:10])
if(write){
write.csv(go_gsea_object, path)
}
return(go_gsea_object)}, error=function(e){
print("nothing enriched")
})
}
kegg_gsea_baseMean <- function(DESEq2_dataframe, write=FALSE, path=""){
locus_tags <- row.names(DESEq2_dataframe)
geneList <- DESEq2_dataframe$baseMean
names(geneList) <- locus_tags
geneList = sort(geneList, decreasing = TRUE)
set.seed(42)
kegg_gsea_object <- gseKEGG(geneList, organism="syn", minGSSize=10, pvalueCutoff = 0.05, seed=TRUE)
tryCatch({
print(head(kegg_gsea_object)[,1:10])
if(write){
write.csv(kegg_gsea_object, path)
}
return(kegg_gsea_object)}, error=function(e){
print("nothing enriched")
})
}
norm_counts <- counts(ddsMat_CDS, normalized=TRUE)
mean_norm_counts <- apply(norm_counts, 1, mean)
mean_norm_counts_CDS <- subset(mean_norm_counts, names(mean_norm_counts) %in% CDS_features$locus_tag)
CDS_feat_df <- as.data.frame(CDS_features)
row.names(CDS_feat_df) <- CDS_feat_df$locus_tag
mean_norm_counts_CDS_width <- mean_norm_counts_CDS/CDS_feat_df[names(mean_norm_counts_CDS),]$width
mean_norm_counts_CDS_width <- sort(mean_norm_counts_CDS_width, decreasing=TRUE)
set.seed(42)
go_gsea_baseMeans_width <- GSEA(mean_norm_counts_CDS_width, TERM2GENE=term_to_gene, TERM2NAME = term_to_name, nPermSimple=10000, seed=TRUE)
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(...): There were 3 pathways for which P-values were
## not calculated properly due to unbalanced (positive and negative) gene-level
## statistic values. For such pathways pval, padj, NES, log2err are set to NA. You
## can try to increase the value of the argument nPermSimple (for example set it
## nPermSimple = 100000)
## no term enriched under specific pvalueCutoff...
set.seed(42)
kegg_gsea_baseMeans_width <- gseKEGG(mean_norm_counts_CDS_width, organism="syn", minGSSize=10, pvalueCutoff = 0.05, nPermSimple=10000, seed=TRUE)
## preparing geneSet collections...
## GSEA analysis...
## no term enriched under specific pvalueCutoff...
First, a data frame in which feature names are assigned to their feature type (CDS, ncRNA, …) has to be created and the respective info read in.
df_plasmid <- data.frame(plasmid=as.character(seqnames(TUs)), TU_ID=TUs$index)
plasmid_gsea <- function(DESEq2_dataframe, write=FALSE, path=""){
TU_IDs <- row.names(DESEq2_dataframe)
geneList <- DESEq2_dataframe$log2FoldChange
names(geneList) <- TU_IDs
geneList = sort(geneList, decreasing = TRUE)
set.seed(42)
go_gsea_object <- GSEA(geneList, TERM2GENE = df_plasmid, seed=TRUE)
print(head(go_gsea_object)[,1:10])
if(write){
write.csv(go_gsea_object, path)
}
return(go_gsea_object)
}
plasmid_gsea_dWT_TV <- plasmid_gsea(TUs_result_dWT_TV, TRUE, "output/enrichment/GSEA_plasmids_dWT-TV.csv")
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
## ID Description setSize enrichmentScore NES
## AP004310.1 AP004310.1 AP004310.1 85 -0.6146525 -2.559085
## AP004311.1 AP004311.1 AP004311.1 73 0.6089396 2.328756
## pvalue p.adjust qvalues rank
## AP004310.1 1.000000e-10 4.000000e-10 2.105263e-10 348
## AP004311.1 1.104233e-08 2.208465e-08 1.162350e-08 408
## leading_edge
## AP004310.1 tags=38%, list=12%, signal=34%
## AP004311.1 tags=47%, list=14%, signal=41%
plasmid_gsea_dWT_WT <- plasmid_gsea(TUs_result_dWT_WT, TRUE, "output/enrichment/GSEA_plasmids_dWT-WT.csv")
## preparing geneSet collections...
## GSEA analysis...
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
## ID Description setSize enrichmentScore NES pvalue
## AP004310.1 AP004310.1 AP004310.1 114 0.7903643 2.956229 1.000000e-10
## AP004311.1 AP004311.1 AP004311.1 90 0.9072201 3.281283 1.000000e-10
## AP006585.1 AP006585.1 AP006585.1 40 0.7232452 2.294462 1.580906e-07
## p.adjust qvalues rank leading_edge
## AP004310.1 2.000000e-10 5.263158e-11 453 tags=74%, list=12%, signal=67%
## AP004311.1 2.000000e-10 5.263158e-11 217 tags=86%, list=6%, signal=83%
## AP006585.1 2.107875e-07 5.547039e-08 398 tags=57%, list=11%, signal=52%
p <- gseaplot2(plasmid_gsea_dWT_TV, geneSetID=1:2)
p
ggsave("output/enrichment/Plots/dWT_TV_plasmids.pdf", plot=p, width=15, height=12, units="cm")
p <- gseaplot2(plasmid_gsea_dWT_WT, geneSetID=1:3)
p
ggsave("output/enrichment/Plots/dWT_WT_plasmids.pdf", plot=p, width=15, height=12, units="cm")
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/atlas/libblas.so.3.10.3
## LAPACK: /usr/lib/x86_64-linux-gnu/atlas/liblapack.so.3.10.3
##
## locale:
## [1] LC_CTYPE=de_DE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=de_DE.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=de_DE.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] forcats_0.5.1 stringr_1.4.0
## [3] dplyr_1.0.5 purrr_0.3.4
## [5] readr_1.4.0 tidyr_1.1.3
## [7] tibble_3.1.0 tidyverse_1.3.0
## [9] rtracklayer_1.50.0 enrichplot_1.10.2
## [11] clusterProfiler_3.18.1 RColorBrewer_1.1-2
## [13] pheatmap_1.0.12 ggpubr_0.4.0
## [15] ggrepel_0.9.1 ggplot2_3.3.3
## [17] DESeq2_1.30.1 SummarizedExperiment_1.20.0
## [19] Biobase_2.50.0 MatrixGenerics_1.2.1
## [21] matrixStats_0.58.0 GenomicRanges_1.42.0
## [23] GenomeInfoDb_1.26.2 IRanges_2.24.1
## [25] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [27] knitr_1.31
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 shadowtext_0.0.7 backports_1.2.1
## [4] fastmatch_1.1-0 plyr_1.8.6 igraph_1.2.6
## [7] splines_4.0.4 BiocParallel_1.24.1 digest_0.6.27
## [10] htmltools_0.5.1.1 GOSemSim_2.16.1 viridis_0.5.1
## [13] GO.db_3.12.1 fansi_0.4.2 magrittr_2.0.1
## [16] memoise_2.0.0 openxlsx_4.2.3 Biostrings_2.58.0
## [19] annotate_1.68.0 graphlayouts_0.7.1 modelr_0.1.8
## [22] colorspace_2.0-0 rvest_1.0.0 blob_1.2.1
## [25] haven_2.3.1 xfun_0.22 crayon_1.4.1
## [28] RCurl_1.98-1.2 jsonlite_1.7.2 scatterpie_0.1.5
## [31] genefilter_1.72.1 survival_3.2-7 glue_1.4.2
## [34] polyclip_1.10-0 gtable_0.3.0 zlibbioc_1.36.0
## [37] XVector_0.30.0 DelayedArray_0.16.2 car_3.0-10
## [40] abind_1.4-5 scales_1.1.1 DOSE_3.16.0
## [43] DBI_1.1.1 rstatix_0.7.0 Rcpp_1.0.6
## [46] viridisLite_0.3.0 xtable_1.8-4 foreign_0.8-81
## [49] bit_4.0.4 httr_1.4.2 fgsea_1.16.0
## [52] ellipsis_0.3.1 pkgconfig_2.0.3 XML_3.99-0.5
## [55] farver_2.1.0 sass_0.3.1 dbplyr_2.1.0
## [58] locfit_1.5-9.4 utf8_1.1.4 labeling_0.4.2
## [61] tidyselect_1.1.0 rlang_0.4.10 reshape2_1.4.4
## [64] AnnotationDbi_1.52.0 munsell_0.5.0 cellranger_1.1.0
## [67] tools_4.0.4 cachem_1.0.4 cli_2.3.1
## [70] downloader_0.4 generics_0.1.0 RSQLite_2.2.3
## [73] broom_0.7.5 evaluate_0.14 fastmap_1.1.0
## [76] yaml_2.2.1 bit64_4.0.5 fs_1.5.0
## [79] tidygraph_1.2.0 zip_2.1.1 ggraph_2.0.5
## [82] xml2_1.3.2 DO.db_2.9 rstudioapi_0.13
## [85] compiler_4.0.4 curl_4.3 ggsignif_0.6.1
## [88] reprex_1.0.0 tweenr_1.0.1 geneplotter_1.68.0
## [91] bslib_0.2.4 stringi_1.5.3 highr_0.8
## [94] lattice_0.20-41 Matrix_1.3-2 vctrs_0.3.6
## [97] pillar_1.5.1 lifecycle_1.0.0 BiocManager_1.30.10
## [100] jquerylib_0.1.3 data.table_1.14.0 cowplot_1.1.1
## [103] bitops_1.0-6 qvalue_2.22.0 R6_2.5.0
## [106] gridExtra_2.3 rio_0.5.26 MASS_7.3-53.1
## [109] assertthat_0.2.1 withr_2.4.1 GenomicAlignments_1.26.0
## [112] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4 hms_1.0.0
## [115] grid_4.0.4 rmarkdown_2.7 rvcheck_0.1.8
## [118] carData_3.0-4 ggforce_0.3.3 lubridate_1.7.10